import json
import urllib.parse
import urllib.request
import pandas as pd
from numpy import nan


class Wiki(object):
    """Fill in missing entity abstracts by linking mentions with Wikifier.

    The NER table (``self.ner``) holds one entity mention per row together
    with the sentence it came from.  Rows that lack an abstract are sent to
    the Wikifier annotation service; the last path segment of the matched
    annotation URL is then looked up in ``self.extract_dict`` (loaded from
    ``./data/abstract.csv``) to obtain the abstract.
    """

    # NOTE(review): this service key is committed in source; consider
    # loading it from configuration and passing it via ``user_key``.
    DEFAULT_USER_KEY = "jlrmbketzgmljvdxgfdjhwmkghifsa"
    WIKIFIER_URL = "http://www.wikifier.org/annotate-article"

    # Columns that receive string labels during processing.  They are kept
    # as object dtype so that writing a string into an all-NaN (float64)
    # column does not rely on implicit upcasting.
    _TEXT_COLUMNS = ("abstract", "tag", "wiki_abstract_key")

    def __init__(self):
        self.extract_dict = dict()
        self.ner = pd.DataFrame()
        self.init_table()

    @staticmethod
    def call_wiki(sentence, lang="en", threshold=0.8, user_key=None, timeout=10):
        """POST ``sentence`` to the Wikifier service and return the decoded JSON.

        :param sentence: text to annotate.
        :param lang: language code sent as the ``lang`` field.
        :param threshold: value sent as ``pageRankSqThreshold``.
        :param user_key: service key; defaults to ``Wiki.DEFAULT_USER_KEY``.
        :param timeout: socket timeout in seconds for the HTTP request.
        :return: the parsed JSON response.
        :raises Exception: network and JSON decoding errors are not caught
            here; callers decide how to handle them.
        """
        if user_key is None:
            user_key = Wiki.DEFAULT_USER_KEY
        data = urllib.parse.urlencode([
            ("text", sentence), ("lang", lang),
            ("userKey", user_key),
            ("pageRankSqThreshold", "%g" % threshold), ("applyPageRankSqThreshold", "true"),
            ("nTopDfValuesToIgnore", "200"), ("nWordsToIgnoreFromList", "200"),
            ("wikiDataClasses", "true"), ("wikiDataClassIds", "false"),
            ("support", "true"), ("ranges", "false"), ("minLinkFrequency", "2"),
            ("includeCosines", "false"), ("maxMentionEntropy", "3")
        ])
        req = urllib.request.Request(Wiki.WIKIFIER_URL, data=data.encode("utf8"), method="POST")
        with urllib.request.urlopen(req, timeout=timeout) as f:
            return json.loads(f.read().decode("utf8"))

    @staticmethod
    def tokenize(text):
        """Return the part of ``text`` after its last ``/``.

        Returns ``None`` (after printing the reason) when ``text`` cannot be
        split, e.g. because it is not a string.
        """
        try:
            return text.split('/')[-1]
        except (AttributeError, TypeError) as e:
            print("Couldn't parse")
            print("Reason", e)
            return None

    @staticmethod
    def _mention_variants(text):
        """Return the spellings of ``text`` accepted as a match.

        These are the text itself, the whitespace-trimmed text, and the
        trimmed text without a leading article "the " (case-insensitive).
        """
        trimmed = text.strip()
        variants = {text, trimmed}
        # Remove the article as a prefix.  ``str.strip('the ')`` would treat
        # its argument as a set of characters and also eat letters that
        # belong to the mention itself ("the earth" -> "ar").
        if trimmed[:4].lower() == "the ":
            variants.add(trimmed[4:].lstrip())
        return variants

    @classmethod
    def _as_text_columns(cls, frame):
        """Cast the label columns present in ``frame`` to object dtype."""
        for column in cls._TEXT_COLUMNS:
            if column in frame.columns:
                frame[column] = frame[column].astype(object)
        return frame

    def extract(self, text, sen):
        """Find the annotation key for the mention ``text`` inside ``sen``.

        :param text: the entity mention to look for.
        :param sen: the sentence that is sent to the Wikifier service.
        :return: the last path segment of the URL of the first annotation
            whose supporting span equals the mention (optionally without a
            leading "the "); ``'unknown'`` when no annotation matches or
            when either argument is not a string; ``None`` when the service
            call failed.
        """
        if not isinstance(text, str) or not isinstance(sen, str):
            # Missing cells arrive as NaN floats; there is nothing to match.
            return 'unknown'

        try:
            res = self.call_wiki(sen)
        except Exception as e:
            # Best effort: one failed request must not abort the whole run.
            print("Call wiki failed")
            print("Reason", e)
            return None

        if not isinstance(res, dict):
            return 'unknown'

        variants = self._mention_variants(text)
        for annotation in res.get("annotations", []):
            # "chTo" is treated as an inclusive end offset, hence the +1.
            mentions = {
                sen[support["chFrom"]:support["chTo"] + 1]
                for support in annotation.get("support", [])
            }
            url = annotation.get("url")
            if url and variants & mentions:
                return self.tokenize(url)
        return 'unknown'

    def init_table(self):
        """Load the abstract lookup table and the NER table from ``./data``.

        ``self.extract_dict`` maps each ``word`` of ``abstract.csv`` to a
        dict of its remaining columns.  Every NER row is tagged "sentence"
        when its abstract is missing and "dbpedia" otherwise.
        """
        print("loading abstract...")
        extract_table = pd.read_csv("./data/abstract.csv")
        extract_table.set_index("word", inplace=True)
        self.extract_dict = extract_table.to_dict('index')
        print("finished")

        print("loading ner...")
        ner = pd.read_excel("./data/NER_US_ORG_PERSON.xlsx")
        ner = ner.drop(columns="Unnamed: 0", errors="ignore")
        # ``isna`` tests the value; an identity check against the numpy
        # ``nan`` object misses NaNs that are boxed from a float column.
        ner["tag"] = ner["abstract"].isna().map({True: "sentence", False: "dbpedia"})
        ner["wiki_abstract_key"] = pd.Series(nan, index=ner.index, dtype=object)
        self.ner = self._as_text_columns(ner)
        print("finished")

    def run(self, flag=0, checkpoint_every=30):
        """Resolve abstracts for all rows that do not have one yet.

        :param flag: when non-zero, reload the checkpoint from ``./cash``
            and skip every row whose index is below ``flag``.
        :param checkpoint_every: write a checkpoint after this many rows
            have been sent to the Wikifier; a falsy value disables it.
        """
        if flag:
            print("load cash...")
            cached = pd.read_csv("./cash/unlimited_wikifier.csv")
            cached = cached.drop(columns="Unnamed: 0", errors="ignore")
            self.ner = self._as_text_columns(cached)

        processed = 0
        for i, dp in self.ner.iterrows():
            if int(i) < flag:
                continue

            if dp["tag"] == 'dbpedia':
                print("skip dbpedia")
                continue

            text = dp["TEXT"]
            key_label = self.extract(text, dp["Sentence"])
            self.ner.loc[i, 'wiki_abstract_key'] = key_label

            entry = self.extract_dict.get(key_label)
            if entry is None:
                print("not found abstract")
            else:
                self.ner.loc[i, 'abstract'] = entry['abstract']
                self.ner.loc[i, 'tag'] = 'tagger_to_finish'
                print(i, text, key_label)

            # Checkpoint on every processed row count, including rows whose
            # abstract was not found, so progress is saved at a steady rate.
            processed += 1
            if checkpoint_every and processed % checkpoint_every == 0:
                print("saving...")
                self.ner.to_csv("./cash/unlimited_wikifier.csv")

        print("saving...")
        self.ner.to_csv("./output/unlimited_wikifier.csv")
        self.ner.to_excel("./output/unlimited_wikifier.xlsx")
        print("finished")

    def url_parse(self):
        """Retry the abstract lookup with percent-decoded keys.

        Reads ``./output/unlimited_wikifier.csv``, and for every row still
        tagged "sentence" whose key is not ``'unknown'`` replaces the key by
        its URL-unquoted form and looks it up again.  The result is written
        to ``./output/fix.csv`` and ``./output/fix.xlsx``.
        """
        frame = pd.read_csv('./output/unlimited_wikifier.csv')
        # Drop the index column written by ``to_csv``, as the other loaders do.
        frame = frame.drop(columns="Unnamed: 0", errors="ignore")
        self.ner = self._as_text_columns(frame)

        for i, dp in self.ner.iterrows():
            if dp['tag'] != 'sentence':
                continue
            wiki_abstract_key = dp['wiki_abstract_key']
            if wiki_abstract_key == 'unknown':
                continue

            # Keys that cannot be decoded (e.g. NaN from an empty cell)
            # fall back to 'unknown'.
            fix_wiki_abstract_key = 'unknown'
            try:
                fix_wiki_abstract_key = urllib.parse.unquote(wiki_abstract_key)
            except (TypeError, AttributeError) as e:
                print(e)
                print("abs key: ", wiki_abstract_key)
            self.ner.loc[i, 'wiki_abstract_key'] = fix_wiki_abstract_key

            entry = self.extract_dict.get(fix_wiki_abstract_key)
            if entry is None:
                print("not found abstract")
                continue

            self.ner.loc[i, 'abstract'] = entry['abstract']
            self.ner.loc[i, 'tag'] = 'tagger_to_finish'

        self.ner.to_csv('./output/fix.csv')
        self.ner.to_excel('./output/fix.xlsx')
